import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_percentage_error
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.api import SimpleExpSmoothing, Holt
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.tsa.seasonal import seasonal_decompose
from pmdarima import auto_arima
from statsmodels.tsa.statespace.sarimax import SARIMAX
from prophet import Prophet
df = pd.read_csv('daily-website-visitors.csv', index_col = 'Date', parse_dates = True)
df.index.freq = 'D'
df
| Row | Day | Day.Of.Week | Page.Loads | Unique.Visits | First.Time.Visits | Returning.Visits | |
|---|---|---|---|---|---|---|---|
| Date | |||||||
| 2014-09-14 | 1 | Sunday | 1 | 2,146 | 1,582 | 1,430 | 152 |
| 2014-09-15 | 2 | Monday | 2 | 3,621 | 2,528 | 2,297 | 231 |
| 2014-09-16 | 3 | Tuesday | 3 | 3,698 | 2,630 | 2,352 | 278 |
| 2014-09-17 | 4 | Wednesday | 4 | 3,667 | 2,614 | 2,327 | 287 |
| 2014-09-18 | 5 | Thursday | 5 | 3,316 | 2,366 | 2,130 | 236 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 2020-08-15 | 2163 | Saturday | 7 | 2,221 | 1,696 | 1,373 | 323 |
| 2020-08-16 | 2164 | Sunday | 1 | 2,724 | 2,037 | 1,686 | 351 |
| 2020-08-17 | 2165 | Monday | 2 | 3,456 | 2,638 | 2,181 | 457 |
| 2020-08-18 | 2166 | Tuesday | 3 | 3,581 | 2,683 | 2,184 | 499 |
| 2020-08-19 | 2167 | Wednesday | 4 | 2,064 | 1,564 | 1,297 | 267 |
2167 rows × 7 columns
df['First.Time.Visits'] = df['First.Time.Visits'].str.replace(',', '')
df['First.Time.Visits'] = df['First.Time.Visits'].astype(int)
plt.rc("figure", figsize = (16, 8))
df['First.Time.Visits'].plot()
<AxesSubplot:xlabel='Date'>
resultSeasonal = seasonal_decompose(df['First.Time.Visits'], freq = 365)
resultSeasonal.plot();
C:\Users\Rudy\AppData\Local\Temp/ipykernel_12076/3293344013.py:1: FutureWarning: the 'freq'' keyword is deprecated, use 'period' instead resultSeasonal = seasonal_decompose(df['First.Time.Visits'], freq = 365)
df1 = df['First.Time.Visits'].resample('W').mean()
df1
Date
2014-09-14 1430.000000
2014-09-21 1884.857143
2014-09-28 2231.571429
2014-10-05 2131.428571
2014-10-12 2300.142857
...
2020-07-26 2226.714286
2020-08-02 2138.714286
2020-08-09 2032.857143
2020-08-16 2051.857143
2020-08-23 1887.333333
Freq: W-SUN, Name: First.Time.Visits, Length: 311, dtype: float64
df1.plot()
resultSeasonal = seasonal_decompose(df1)
resultSeasonal.plot();
df['First.Time.Visits'].head(21).plot()
<AxesSubplot:xlabel='Date'>
resultSeasonal = seasonal_decompose(df['First.Time.Visits'], freq = 7)
resultSeasonal.plot();
C:\Users\Rudy\AppData\Local\Temp/ipykernel_12076/4102391038.py:1: FutureWarning: the 'freq'' keyword is deprecated, use 'period' instead resultSeasonal = seasonal_decompose(df['First.Time.Visits'], freq = 7)
adfuller(df1) # null hypothesis: time series is non-stationary
(-5.361144359532205,
4.08540684894091e-06,
7,
303,
{'1%': -3.4521175397304784,
'5%': -2.8711265007266666,
'10%': -2.571877823851692},
4102.702546858949)
plot_acf(df1, zero = False);
plot_pacf(df1, zero = False);
auto_arima(df1, seasonal = True, m = 52, trace = True).summary()
Performing stepwise search to minimize aic ARIMA(2,0,2)(1,0,1)[52] intercept : AIC=4217.971, Time=4.94 sec ARIMA(0,0,0)(0,0,0)[52] intercept : AIC=4867.836, Time=0.01 sec ARIMA(1,0,0)(1,0,0)[52] intercept : AIC=inf, Time=2.30 sec ARIMA(0,0,1)(0,0,1)[52] intercept : AIC=4517.104, Time=1.33 sec ARIMA(0,0,0)(0,0,0)[52] : AIC=5751.546, Time=0.01 sec ARIMA(2,0,2)(0,0,1)[52] intercept : AIC=4277.397, Time=5.24 sec ARIMA(2,0,2)(1,0,0)[52] intercept : AIC=inf, Time=4.38 sec ARIMA(2,0,2)(2,0,1)[52] intercept : AIC=inf, Time=21.09 sec ARIMA(2,0,2)(1,0,2)[52] intercept : AIC=4208.140, Time=20.30 sec ARIMA(2,0,2)(0,0,2)[52] intercept : AIC=inf, Time=14.14 sec ARIMA(2,0,2)(2,0,2)[52] intercept : AIC=inf, Time=23.91 sec ARIMA(1,0,2)(1,0,2)[52] intercept : AIC=inf, Time=18.09 sec ARIMA(2,0,1)(1,0,2)[52] intercept : AIC=4220.713, Time=17.23 sec ARIMA(3,0,2)(1,0,2)[52] intercept : AIC=inf, Time=24.18 sec ARIMA(2,0,3)(1,0,2)[52] intercept : AIC=4205.615, Time=17.06 sec ARIMA(2,0,3)(0,0,2)[52] intercept : AIC=inf, Time=9.65 sec ARIMA(2,0,3)(1,0,1)[52] intercept : AIC=4214.717, Time=5.55 sec ARIMA(2,0,3)(2,0,2)[52] intercept : AIC=inf, Time=20.86 sec ARIMA(2,0,3)(0,0,1)[52] intercept : AIC=4268.738, Time=2.73 sec ARIMA(2,0,3)(2,0,1)[52] intercept : AIC=inf, Time=22.84 sec ARIMA(1,0,3)(1,0,2)[52] intercept : AIC=4201.909, Time=15.97 sec ARIMA(1,0,3)(0,0,2)[52] intercept : AIC=inf, Time=7.59 sec ARIMA(1,0,3)(1,0,1)[52] intercept : AIC=4229.956, Time=4.96 sec ARIMA(1,0,3)(2,0,2)[52] intercept : AIC=inf, Time=19.90 sec ARIMA(1,0,3)(0,0,1)[52] intercept : AIC=4268.375, Time=2.00 sec ARIMA(1,0,3)(2,0,1)[52] intercept : AIC=inf, Time=19.64 sec ARIMA(0,0,3)(1,0,2)[52] intercept : AIC=4294.452, Time=11.45 sec ARIMA(1,0,4)(1,0,2)[52] intercept : AIC=4208.445, Time=13.78 sec ARIMA(0,0,2)(1,0,2)[52] intercept : AIC=inf, Time=16.32 sec ARIMA(0,0,4)(1,0,2)[52] intercept : AIC=inf, Time=15.00 sec ARIMA(2,0,4)(1,0,2)[52] intercept : AIC=inf, Time=21.88 sec ARIMA(1,0,3)(1,0,2)[52] : AIC=4182.514, Time=14.88 sec ARIMA(1,0,3)(0,0,2)[52] : AIC=inf, Time=10.33 sec ARIMA(1,0,3)(1,0,1)[52] : AIC=4183.083, Time=4.86 sec ARIMA(1,0,3)(2,0,2)[52] : AIC=inf, Time=17.37 sec ARIMA(1,0,3)(0,0,1)[52] : AIC=4281.854, Time=2.83 sec ARIMA(1,0,3)(2,0,1)[52] : AIC=inf, Time=19.94 sec ARIMA(0,0,3)(1,0,2)[52] : AIC=inf, Time=12.56 sec ARIMA(1,0,2)(1,0,2)[52] : AIC=4190.084, Time=14.52 sec ARIMA(2,0,3)(1,0,2)[52] : AIC=4197.004, Time=17.39 sec ARIMA(1,0,4)(1,0,2)[52] : AIC=4182.458, Time=19.50 sec ARIMA(1,0,4)(0,0,2)[52] : AIC=inf, Time=12.75 sec ARIMA(1,0,4)(1,0,1)[52] : AIC=4184.601, Time=5.43 sec ARIMA(1,0,4)(2,0,2)[52] : AIC=inf, Time=21.46 sec ARIMA(1,0,4)(0,0,1)[52] : AIC=4283.704, Time=3.83 sec ARIMA(1,0,4)(2,0,1)[52] : AIC=4190.786, Time=21.35 sec ARIMA(0,0,4)(1,0,2)[52] : AIC=inf, Time=29.35 sec ARIMA(2,0,4)(1,0,2)[52] : AIC=4187.035, Time=20.01 sec ARIMA(1,0,5)(1,0,2)[52] : AIC=4183.595, Time=21.20 sec ARIMA(0,0,5)(1,0,2)[52] : AIC=inf, Time=10.66 sec ARIMA(2,0,5)(1,0,2)[52] : AIC=4185.555, Time=21.98 sec Best model: ARIMA(1,0,4)(1,0,2)[52] Total fit time: 686.632 seconds
| Dep. Variable: | y | No. Observations: | 311 |
|---|---|---|---|
| Model: | SARIMAX(1, 0, 4)x(1, 0, [1, 2], 52) | Log Likelihood | -2082.229 |
| Date: | Fri, 13 May 2022 | AIC | 4182.458 |
| Time: | 14:40:36 | BIC | 4216.116 |
| Sample: | 0 | HQIC | 4195.911 |
| - 311 | |||
| Covariance Type: | opg |
| coef | std err | z | P>|z| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| ar.L1 | 0.9835 | 0.013 | 76.834 | 0.000 | 0.958 | 1.009 |
| ma.L1 | 0.0256 | 0.037 | 0.684 | 0.494 | -0.048 | 0.099 |
| ma.L2 | -0.1762 | 0.055 | -3.204 | 0.001 | -0.284 | -0.068 |
| ma.L3 | -0.2057 | 0.055 | -3.711 | 0.000 | -0.314 | -0.097 |
| ma.L4 | 0.0913 | 0.064 | 1.436 | 0.151 | -0.033 | 0.216 |
| ar.S.L52 | 0.8940 | 0.025 | 36.114 | 0.000 | 0.845 | 0.943 |
| ma.S.L52 | -0.4022 | 0.073 | -5.508 | 0.000 | -0.545 | -0.259 |
| ma.S.L104 | 0.1289 | 0.070 | 1.844 | 0.065 | -0.008 | 0.266 |
| sigma2 | 3.104e+04 | 1897.797 | 16.358 | 0.000 | 2.73e+04 | 3.48e+04 |
| Ljung-Box (L1) (Q): | 0.04 | Jarque-Bera (JB): | 146.51 |
|---|---|---|---|
| Prob(Q): | 0.85 | Prob(JB): | 0.00 |
| Heteroskedasticity (H): | 1.66 | Skew: | -0.41 |
| Prob(H) (two-sided): | 0.01 | Kurtosis: | 6.26 |
train_data = df1.iloc[:-4]
test_data = df1.iloc[-4:]
fitSES = SimpleExpSmoothing(train_data).fit()
fcastSES = fitSES.forecast(len(test_data)).rename('SES predict')
print(mean_absolute_percentage_error(test_data, fcastSES))
0.10022964314802252
c:\Users\Rudy\anaconda3\lib\site-packages\statsmodels\tsa\holtwinters\model.py:427: FutureWarning: After 0.13 initialization must be handled at model creation warnings.warn( c:\Users\Rudy\anaconda3\lib\site-packages\statsmodels\tsa\base\tsa_model.py:132: FutureWarning: The 'freq' argument in Timestamp is deprecated and will be removed in a future version. date_key = Timestamp(key, freq=base_index.freq)
fitHolt1 = Holt(train_data, exponential = False).fit()
fcastHolt1 = fitHolt1.forecast(len(test_data)).rename("Holt's predict 1")
#Exponential
fitHolt2 = Holt(train_data, exponential = True).fit()
fcastHolt2 = fitHolt2.forecast(len(test_data)).rename("Holt's predict 2")
#Damped trend
fitHolt3 = Holt(train_data, damped_trend = True).fit()
fcastHolt3 = fitHolt3.forecast(len(test_data)).rename("Holt's predict 3")
#Exponential and damped
fitHolt4 = Holt(train_data, exponential = True, damped_trend = True).fit()
fcastHolt4 = fitHolt4.forecast(len(test_data)).rename("Holt's predict 4")
#finding out the best one based on MAPE
print(mean_absolute_percentage_error(test_data, fcastHolt1))
print(mean_absolute_percentage_error(test_data, fcastHolt2))
print(mean_absolute_percentage_error(test_data, fcastHolt3))
print(mean_absolute_percentage_error(test_data, fcastHolt4))
c:\Users\Rudy\anaconda3\lib\site-packages\statsmodels\tsa\holtwinters\model.py:427: FutureWarning: After 0.13 initialization must be handled at model creation warnings.warn( c:\Users\Rudy\anaconda3\lib\site-packages\statsmodels\tsa\base\tsa_model.py:132: FutureWarning: The 'freq' argument in Timestamp is deprecated and will be removed in a future version. date_key = Timestamp(key, freq=base_index.freq)
0.05382450791912963 0.04204660701023219 0.06422058175824172 0.04272137927648041
#multiplicative for both
fitHoltWinter1 = ExponentialSmoothing(train_data, trend = 'mul', seasonal = 'mul', seasonal_periods = 52).fit()
fcastHoltWinter1 = fitHoltWinter1.forecast(len(test_data)).rename("Holt-Winter's predict 1")
#mul trend, add seasonal
fitHoltWinter2 = ExponentialSmoothing(train_data, trend = 'mul', seasonal = 'add', seasonal_periods = 52).fit()
fcastHoltWinter2 = fitHoltWinter2.forecast(len(test_data)).rename("Holt-Winter's predict 2")
#add trend, mul seasonal
fitHoltWinter3 = ExponentialSmoothing(train_data, trend = 'add', seasonal = 'mul', seasonal_periods = 52).fit()
fcastHoltWinter3 = fitHoltWinter3.forecast(len(test_data)).rename("Holt-Winter's predict 3")
#both additive
fitHoltWinter4 = ExponentialSmoothing(train_data, trend = 'add', seasonal = 'add', seasonal_periods = 52).fit()
fcastHoltWinter4 = fitHoltWinter4.forecast(len(test_data)).rename("Holt-Winter's predict 4")
print(mean_absolute_percentage_error(test_data, fcastHoltWinter1))
print(mean_absolute_percentage_error(test_data, fcastHoltWinter2))
print(mean_absolute_percentage_error(test_data, fcastHoltWinter3))
print(mean_absolute_percentage_error(test_data, fcastHoltWinter4))
c:\Users\Rudy\anaconda3\lib\site-packages\statsmodels\tsa\holtwinters\model.py:427: FutureWarning: After 0.13 initialization must be handled at model creation warnings.warn( c:\Users\Rudy\anaconda3\lib\site-packages\statsmodels\tsa\base\tsa_model.py:132: FutureWarning: The 'freq' argument in Timestamp is deprecated and will be removed in a future version. date_key = Timestamp(key, freq=base_index.freq)
0.1265903043287256 0.11988441141378328 0.08348884911604443 0.1085159275419316
fitSARIMA = SARIMAX(train_data, order = (1, 0, 4), seasonal_order = (1, 0, 2, 52)).fit()
fitSARIMA.summary()
c:\Users\Rudy\anaconda3\lib\site-packages\statsmodels\tsa\statespace\sarimax.py:866: UserWarning: Too few observations to estimate starting parameters for seasonal ARMA. All parameters except for variances will be set to zeros.
warn('Too few observations to estimate starting parameters%s.'
c:\Users\Rudy\anaconda3\lib\site-packages\statsmodels\base\model.py:566: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
warnings.warn("Maximum Likelihood optimization failed to "
| Dep. Variable: | First.Time.Visits | No. Observations: | 307 |
|---|---|---|---|
| Model: | SARIMAX(1, 0, 4)x(1, 0, [1, 2], 52) | Log Likelihood | -2056.638 |
| Date: | Fri, 13 May 2022 | AIC | 4131.276 |
| Time: | 14:40:56 | BIC | 4164.818 |
| Sample: | 09-14-2014 | HQIC | 4144.689 |
| - 07-26-2020 | |||
| Covariance Type: | opg |
| coef | std err | z | P>|z| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| ar.L1 | 0.9846 | 0.013 | 77.523 | 0.000 | 0.960 | 1.009 |
| ma.L1 | 0.0246 | 0.038 | 0.653 | 0.514 | -0.049 | 0.098 |
| ma.L2 | -0.1800 | 0.056 | -3.241 | 0.001 | -0.289 | -0.071 |
| ma.L3 | -0.2150 | 0.056 | -3.840 | 0.000 | -0.325 | -0.105 |
| ma.L4 | 0.0903 | 0.064 | 1.413 | 0.158 | -0.035 | 0.216 |
| ar.S.L52 | 0.8940 | 0.025 | 35.736 | 0.000 | 0.845 | 0.943 |
| ma.S.L52 | -0.4089 | 0.074 | -5.519 | 0.000 | -0.554 | -0.264 |
| ma.S.L104 | 0.1319 | 0.071 | 1.868 | 0.062 | -0.006 | 0.270 |
| sigma2 | 3.121e+04 | 1917.299 | 16.281 | 0.000 | 2.75e+04 | 3.5e+04 |
| Ljung-Box (L1) (Q): | 0.03 | Jarque-Bera (JB): | 147.19 |
|---|---|---|---|
| Prob(Q): | 0.86 | Prob(JB): | 0.00 |
| Heteroskedasticity (H): | 1.65 | Skew: | -0.43 |
| Prob(H) (two-sided): | 0.01 | Kurtosis: | 6.28 |
start = len(train_data)
end = start + len(test_data) - 1
fcastSARIMA = fitSARIMA.predict(start = start, end = end, dynamic = False).rename('SARIMA')
print(mean_absolute_percentage_error(test_data, fcastSARIMA))
0.10491919751770648
data = train_data.reset_index()[['Date', 'First.Time.Visits']]
data.columns = ['ds', 'y']
data
| ds | y | |
|---|---|---|
| 0 | 2014-09-14 | 1430.000000 |
| 1 | 2014-09-21 | 1884.857143 |
| 2 | 2014-09-28 | 2231.571429 |
| 3 | 2014-10-05 | 2131.428571 |
| 4 | 2014-10-12 | 2300.142857 |
| ... | ... | ... |
| 302 | 2020-06-28 | 2528.000000 |
| 303 | 2020-07-05 | 2193.285714 |
| 304 | 2020-07-12 | 2351.428571 |
| 305 | 2020-07-19 | 2162.142857 |
| 306 | 2020-07-26 | 2226.714286 |
307 rows × 2 columns
fitProphet = Prophet(yearly_seasonality = True)
fitProphet.fit(data)
future = fitProphet.make_future_dataframe(len(test_data), freq = 'W')
fcastProphet = fitProphet.predict(future)
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
prophetForecast = fcastProphet[['ds', 'yhat']].iloc[-len(test_data):]
prophetForecast.set_index('ds', inplace = True)
prophetForecast.index.name = 'Date'
prophetForecast.columns = ['First.Time.Visits']
prophetForecast
| First.Time.Visits | |
|---|---|
| Date | |
| 2020-08-02 | 2182.351847 |
| 2020-08-09 | 2180.860046 |
| 2020-08-16 | 2215.797885 |
| 2020-08-23 | 2248.347887 |
mean_absolute_percentage_error(test_data, prophetForecast['First.Time.Visits'])
0.09109764900946124
test_data.plot(legend = True)
fcastSES.plot(legend = True)
fcastHolt2.plot(legend = True)
fcastHoltWinter3.plot(legend = True)
fcastSARIMA.plot(legend = True)
prophetForecast['First.Time.Visits'].plot(legend = True)
<AxesSubplot:xlabel='Date'>
#Simple exponential smoothing
fitSES = SimpleExpSmoothing(df1).fit()
fcastSES = fitSES.forecast(4).rename('SES predict')
#Holt's method
fitHolt = Holt(df1, exponential = True).fit()
fcastHolt = fitHolt.forecast(4).rename("Holt's predict")
#Holt-Winter's method
fitHoltWinter = ExponentialSmoothing(df1, trend = 'add', seasonal = 'mul', seasonal_periods = 52).fit()
fcastHoltWinter = fitHoltWinter.forecast(4).rename("Holt-Winter's predict")
#SARIMA
fitSARIMA = SARIMAX(df1, order = (1, 0, 4), seasonal_order = (1, 0, 2, 52)).fit()
start = len(df1)
end = start + 3
fcastSARIMA = fitSARIMA.predict(start = start, end = end, dynamic = False).rename('SARIMA')
#Prophet
data = df1.reset_index()[['Date', 'First.Time.Visits']]
data.columns = ['ds', 'y']
fitProphet = Prophet(yearly_seasonality = True)
fitProphet.fit(data)
future = fitProphet.make_future_dataframe(4, freq = 'W')
fcastProphet = fitProphet.predict(future)
#going back to normal dataframe from prophet
prophetForecast = fcastProphet[['ds', 'yhat']].iloc[-4:]
prophetForecast.set_index('ds', inplace = True)
prophetForecast.index.name = 'Date'
prophetForecast.columns = ['First.Time.Visits']
prophetForecast
c:\Users\Rudy\anaconda3\lib\site-packages\statsmodels\tsa\holtwinters\model.py:427: FutureWarning: After 0.13 initialization must be handled at model creation
warnings.warn(
c:\Users\Rudy\anaconda3\lib\site-packages\statsmodels\tsa\base\tsa_model.py:132: FutureWarning: The 'freq' argument in Timestamp is deprecated and will be removed in a future version.
date_key = Timestamp(key, freq=base_index.freq)
c:\Users\Rudy\anaconda3\lib\site-packages\statsmodels\tsa\statespace\sarimax.py:866: UserWarning: Too few observations to estimate starting parameters for seasonal ARMA. All parameters except for variances will be set to zeros.
warn('Too few observations to estimate starting parameters%s.'
c:\Users\Rudy\anaconda3\lib\site-packages\statsmodels\base\model.py:566: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
warnings.warn("Maximum Likelihood optimization failed to "
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
| First.Time.Visits | |
|---|---|
| Date | |
| 2020-08-30 | 2231.360981 |
| 2020-09-06 | 2316.522366 |
| 2020-09-13 | 2498.980460 |
| 2020-09-20 | 2738.991903 |
df1.tail(54).plot(legend = True)
fcastSES.plot(legend = True)
fcastHolt.plot(legend = True)
fcastHoltWinter.plot(legend = True)
fcastSARIMA.plot(legend = True)
prophetForecast['First.Time.Visits'].plot(legend = True)
<AxesSubplot:xlabel='Date'>
df1 = df['First.Time.Visits'].diff(365).dropna()
df1
Date
2015-09-14 1524.0
2015-09-15 647.0
2015-09-16 550.0
2015-09-17 409.0
2015-09-18 -106.0
...
2020-08-15 -290.0
2020-08-16 612.0
2020-08-17 704.0
2020-08-18 -187.0
2020-08-19 -996.0
Freq: D, Name: First.Time.Visits, Length: 1802, dtype: float64
plt.rc("figure", figsize = (16, 8))
df1.plot()
resultSeasonal = seasonal_decompose(df1, freq = 7)
resultSeasonal.plot();
C:\Users\Rudy\AppData\Local\Temp/ipykernel_12076/4060646942.py:3: FutureWarning: the 'freq'' keyword is deprecated, use 'period' instead resultSeasonal = seasonal_decompose(df1, freq = 7)
adfuller(df1) # null hypothesis: time series is non-stationary
(-2.8736866944947317,
0.048494556954374046,
22,
1779,
{'1%': -3.434031147125674,
'5%': -2.863166021720215,
'10%': -2.5676356430449427},
24463.68985856995)
plot_acf(df1, zero = False);
plot_pacf(df1, zero = False);
auto_arima(df1, seasonal = True, m = 7, trace = True).summary()
Performing stepwise search to minimize aic ARIMA(2,1,2)(1,0,1)[7] intercept : AIC=inf, Time=2.96 sec ARIMA(0,1,0)(0,0,0)[7] intercept : AIC=28475.578, Time=0.02 sec ARIMA(1,1,0)(1,0,0)[7] intercept : AIC=25530.533, Time=1.63 sec ARIMA(0,1,1)(0,0,1)[7] intercept : AIC=27100.502, Time=1.05 sec ARIMA(0,1,0)(0,0,0)[7] : AIC=28473.587, Time=0.01 sec ARIMA(1,1,0)(0,0,0)[7] intercept : AIC=28473.196, Time=0.05 sec ARIMA(1,1,0)(2,0,0)[7] intercept : AIC=inf, Time=2.74 sec ARIMA(1,1,0)(1,0,1)[7] intercept : AIC=25063.454, Time=1.77 sec ARIMA(1,1,0)(0,0,1)[7] intercept : AIC=27237.170, Time=0.55 sec ARIMA(1,1,0)(2,0,1)[7] intercept : AIC=inf, Time=4.13 sec ARIMA(1,1,0)(1,0,2)[7] intercept : AIC=25012.985, Time=3.77 sec ARIMA(1,1,0)(0,0,2)[7] intercept : AIC=26619.572, Time=1.06 sec ARIMA(1,1,0)(2,0,2)[7] intercept : AIC=inf, Time=1.49 sec ARIMA(0,1,0)(1,0,2)[7] intercept : AIC=inf, Time=2.87 sec ARIMA(2,1,0)(1,0,2)[7] intercept : AIC=inf, Time=3.81 sec ARIMA(1,1,1)(1,0,2)[7] intercept : AIC=inf, Time=8.68 sec ARIMA(0,1,1)(1,0,2)[7] intercept : AIC=24965.644, Time=7.97 sec ARIMA(0,1,1)(0,0,2)[7] intercept : AIC=26555.181, Time=4.49 sec ARIMA(0,1,1)(1,0,1)[7] intercept : AIC=24980.414, Time=2.53 sec ARIMA(0,1,1)(2,0,2)[7] intercept : AIC=inf, Time=8.71 sec ARIMA(0,1,1)(2,0,1)[7] intercept : AIC=inf, Time=3.42 sec ARIMA(0,1,2)(1,0,2)[7] intercept : AIC=24780.373, Time=9.18 sec ARIMA(0,1,2)(0,0,2)[7] intercept : AIC=26040.238, Time=5.25 sec ARIMA(0,1,2)(1,0,1)[7] intercept : AIC=24830.572, Time=2.55 sec ARIMA(0,1,2)(2,0,2)[7] intercept : AIC=inf, Time=9.73 sec ARIMA(0,1,2)(0,0,1)[7] intercept : AIC=26548.849, Time=1.42 sec ARIMA(0,1,2)(2,0,1)[7] intercept : AIC=inf, Time=4.20 sec ARIMA(1,1,2)(1,0,2)[7] intercept : AIC=inf, Time=4.68 sec ARIMA(0,1,3)(1,0,2)[7] intercept : AIC=inf, Time=9.91 sec ARIMA(1,1,3)(1,0,2)[7] intercept : AIC=inf, Time=16.49 sec ARIMA(0,1,2)(1,0,2)[7] : AIC=inf, Time=3.40 sec Best model: ARIMA(0,1,2)(1,0,2)[7] intercept Total fit time: 130.533 seconds
| Dep. Variable: | y | No. Observations: | 1802 |
|---|---|---|---|
| Model: | SARIMAX(0, 1, 2)x(1, 0, 2, 7) | Log Likelihood | -12383.187 |
| Date: | Fri, 13 May 2022 | AIC | 24780.373 |
| Time: | 14:43:30 | BIC | 24818.846 |
| Sample: | 0 | HQIC | 24794.574 |
| - 1802 | |||
| Covariance Type: | opg |
| coef | std err | z | P>|z| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| intercept | 0.0016 | 0.026 | 0.061 | 0.952 | -0.050 | 0.053 |
| ma.L1 | -0.3734 | 0.018 | -20.547 | 0.000 | -0.409 | -0.338 |
| ma.L2 | -0.3100 | 0.019 | -16.103 | 0.000 | -0.348 | -0.272 |
| ar.S.L7 | 1.0000 | 3.84e-06 | 2.61e+05 | 0.000 | 1.000 | 1.000 |
| ma.S.L7 | -0.7406 | 0.020 | -37.316 | 0.000 | -0.780 | -0.702 |
| ma.S.L14 | -0.1492 | 0.019 | -7.866 | 0.000 | -0.186 | -0.112 |
| sigma2 | 5.189e+04 | 6.72e-08 | 7.73e+11 | 0.000 | 5.19e+04 | 5.19e+04 |
| Ljung-Box (L1) (Q): | 1.46 | Jarque-Bera (JB): | 284.38 |
|---|---|---|---|
| Prob(Q): | 0.23 | Prob(JB): | 0.00 |
| Heteroskedasticity (H): | 1.10 | Skew: | -0.28 |
| Prob(H) (two-sided): | 0.25 | Kurtosis: | 4.87 |
train_data = df1.iloc[:-30]
test_data = df1.iloc[-30:]
fitSES2 = SimpleExpSmoothing(train_data).fit()
fcastSES2 = fitSES2.forecast(len(test_data)).rename('SES predict')
print(mean_absolute_percentage_error(test_data, fcastSES2))
5.660745756341342
c:\Users\Rudy\anaconda3\lib\site-packages\statsmodels\tsa\holtwinters\model.py:427: FutureWarning: After 0.13 initialization must be handled at model creation warnings.warn( c:\Users\Rudy\anaconda3\lib\site-packages\statsmodels\tsa\holtwinters\model.py:920: ConvergenceWarning: Optimization failed to converge. Check mle_retvals. warnings.warn( c:\Users\Rudy\anaconda3\lib\site-packages\statsmodels\tsa\base\tsa_model.py:132: FutureWarning: The 'freq' argument in Timestamp is deprecated and will be removed in a future version. date_key = Timestamp(key, freq=base_index.freq)
fitHolt1_2 = Holt(train_data, exponential = False).fit()
fcastHolt1_2 = fitHolt1_2.forecast(len(test_data)).rename("Holt's predict 1")
#Damped trend
fitHolt2_2 = Holt(train_data, damped_trend = True).fit()
fcastHolt2_2 = fitHolt2_2.forecast(len(test_data)).rename("Holt's predict 2")
#finding out the best one based on MAPE
print(mean_absolute_percentage_error(test_data, fcastHolt1_2))
print(mean_absolute_percentage_error(test_data, fcastHolt2_2))
15.063487981603364 14.868458425130997
c:\Users\Rudy\anaconda3\lib\site-packages\statsmodels\tsa\holtwinters\model.py:427: FutureWarning: After 0.13 initialization must be handled at model creation warnings.warn( c:\Users\Rudy\anaconda3\lib\site-packages\statsmodels\tsa\holtwinters\model.py:920: ConvergenceWarning: Optimization failed to converge. Check mle_retvals. warnings.warn( c:\Users\Rudy\anaconda3\lib\site-packages\statsmodels\tsa\base\tsa_model.py:132: FutureWarning: The 'freq' argument in Timestamp is deprecated and will be removed in a future version. date_key = Timestamp(key, freq=base_index.freq) c:\Users\Rudy\anaconda3\lib\site-packages\statsmodels\tsa\holtwinters\model.py:920: ConvergenceWarning: Optimization failed to converge. Check mle_retvals. warnings.warn(
#both additive
fitHoltWinter2 = ExponentialSmoothing(train_data, trend = 'add', seasonal = 'add', seasonal_periods = 7).fit()
fcastHoltWinter2 = fitHoltWinter2.forecast(len(test_data)).rename("Holt-Winter's predict")
print(mean_absolute_percentage_error(test_data, fcastHoltWinter2))
c:\Users\Rudy\anaconda3\lib\site-packages\statsmodels\tsa\holtwinters\model.py:427: FutureWarning: After 0.13 initialization must be handled at model creation warnings.warn( c:\Users\Rudy\anaconda3\lib\site-packages\statsmodels\tsa\base\tsa_model.py:132: FutureWarning: The 'freq' argument in Timestamp is deprecated and will be removed in a future version. date_key = Timestamp(key, freq=base_index.freq)
1.828931007613234
fitSARIMA2 = SARIMAX(train_data, order = (0, 1, 2), seasonal_order = (1, 0, 2, 7)).fit()
fitSARIMA2.summary()
c:\Users\Rudy\anaconda3\lib\site-packages\statsmodels\tsa\statespace\sarimax.py:978: UserWarning: Non-invertible starting MA parameters found. Using zeros as starting parameters.
warn('Non-invertible starting MA parameters found.'
| Dep. Variable: | First.Time.Visits | No. Observations: | 1772 |
|---|---|---|---|
| Model: | SARIMAX(0, 1, 2)x(1, 0, 2, 7) | Log Likelihood | -12177.213 |
| Date: | Fri, 13 May 2022 | AIC | 24366.426 |
| Time: | 14:43:35 | BIC | 24399.302 |
| Sample: | 09-14-2015 | HQIC | 24378.571 |
| - 07-20-2020 | |||
| Covariance Type: | opg |
| coef | std err | z | P>|z| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| ma.L1 | -0.3757 | 0.019 | -20.047 | 0.000 | -0.412 | -0.339 |
| ma.L2 | -0.3175 | 0.020 | -15.969 | 0.000 | -0.356 | -0.279 |
| ar.S.L7 | 1.0000 | 2.35e-06 | 4.26e+05 | 0.000 | 1.000 | 1.000 |
| ma.S.L7 | -0.7387 | 0.020 | -36.261 | 0.000 | -0.779 | -0.699 |
| ma.S.L14 | -0.1743 | 0.020 | -8.900 | 0.000 | -0.213 | -0.136 |
| sigma2 | 5.331e+04 | 6.27e-08 | 8.5e+11 | 0.000 | 5.33e+04 | 5.33e+04 |
| Ljung-Box (L1) (Q): | 1.75 | Jarque-Bera (JB): | 229.71 |
|---|---|---|---|
| Prob(Q): | 0.19 | Prob(JB): | 0.00 |
| Heteroskedasticity (H): | 1.16 | Skew: | -0.26 |
| Prob(H) (two-sided): | 0.07 | Kurtosis: | 4.69 |
start = len(train_data)
end = start + len(test_data) - 1
fcastSARIMA2 = fitSARIMA2.predict(start = start, end = end, dynamic = False).rename('SARIMA')
print(mean_absolute_percentage_error(test_data, fcastSARIMA2))
1.6985804591923992
data = train_data.reset_index()[['Date', 'First.Time.Visits']]
data.columns = ['ds', 'y']
data
| ds | y | |
|---|---|---|
| 0 | 2015-09-14 | 1524.0 |
| 1 | 2015-09-15 | 647.0 |
| 2 | 2015-09-16 | 550.0 |
| 3 | 2015-09-17 | 409.0 |
| 4 | 2015-09-18 | -106.0 |
| ... | ... | ... |
| 1767 | 2020-07-16 | 103.0 |
| 1768 | 2020-07-17 | -127.0 |
| 1769 | 2020-07-18 | -35.0 |
| 1770 | 2020-07-19 | 859.0 |
| 1771 | 2020-07-20 | 1072.0 |
1772 rows × 2 columns
fitProphet = Prophet(weekly_seasonality = True)
fitProphet.fit(data)
future = fitProphet.make_future_dataframe(len(test_data))
fcastProphet = fitProphet.predict(future)
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
prophetForecast2 = fcastProphet[['ds', 'yhat']].iloc[-len(test_data):]
prophetForecast2.set_index('ds', inplace = True)
prophetForecast2.index.name = 'Date'
prophetForecast2.columns = ['Prophet']
prophetForecast2.index.freq = 'D'
prophetForecast2
| Prophet | |
|---|---|
| Date | |
| 2020-07-21 | 419.519843 |
| 2020-07-22 | 315.548031 |
| 2020-07-23 | 208.350486 |
| 2020-07-24 | -215.755736 |
| 2020-07-25 | -310.931253 |
| 2020-07-26 | 862.572809 |
| 2020-07-27 | 1288.287455 |
| 2020-07-28 | 422.109057 |
| 2020-07-29 | 315.678952 |
| 2020-07-30 | 206.068022 |
| 2020-07-31 | -220.330984 |
| 2020-08-01 | -317.606676 |
| 2020-08-02 | 854.056032 |
| 2020-08-03 | 1278.246640 |
| 2020-08-04 | 410.910644 |
| 2020-08-05 | 303.727776 |
| 2020-08-06 | 193.795583 |
| 2020-08-07 | -232.478902 |
| 2020-08-08 | -329.182632 |
| 2020-08-09 | 843.488629 |
| 2020-08-10 | 1269.101541 |
| 2020-08-11 | 403.567625 |
| 2020-08-12 | 298.522701 |
| 2020-08-13 | 191.011928 |
| 2020-08-14 | -232.616807 |
| 2020-08-15 | -326.514491 |
| 2020-08-16 | 849.056199 |
| 2020-08-17 | 1277.594170 |
| 2020-08-18 | 414.944393 |
| 2020-08-19 | 312.679294 |
mean_absolute_percentage_error(test_data, prophetForecast2['Prophet'])
2.5786664719682793
test_data.plot(legend = True)
fcastHoltWinter2.plot(legend = True)
fcastSARIMA2.plot(legend = True)
prophetForecast2['Prophet'].plot(legend = True)
<AxesSubplot:xlabel='Date'>
#Holt-Winter's method
fitHoltWinter2 = ExponentialSmoothing(df1, trend = 'add', seasonal = 'add', seasonal_periods = 7).fit()
fcastHoltWinter2 = fitHoltWinter2.forecast(30).rename("Holt-Winter's predict")
#SARIMA
fitSARIMA2 = SARIMAX(df1, order = (0, 1, 2), seasonal_order = (1, 0, 2, 7)).fit()
start = len(df1)
end = start + 29
fcastSARIMA2 = fitSARIMA2.predict(start = start, end = end, dynamic = False).rename('SARIMA')
#Prophet
data = df1.reset_index()[['Date', 'First.Time.Visits']]
data.columns = ['ds', 'y']
fitProphet = Prophet(weekly_seasonality = True)
fitProphet.fit(data)
future = fitProphet.make_future_dataframe(30)
fcastProphet = fitProphet.predict(future)
#going back to normal dataframe from prophet
prophetForecast2 = fcastProphet[['ds', 'yhat']].iloc[-30:]
prophetForecast2.set_index('ds', inplace = True)
prophetForecast2.index.name = 'Date'
prophetForecast2.columns = ['Prophet']
prophetForecast2.index.freq = 'D'
c:\Users\Rudy\anaconda3\lib\site-packages\statsmodels\tsa\holtwinters\model.py:427: FutureWarning: After 0.13 initialization must be handled at model creation
warnings.warn(
c:\Users\Rudy\anaconda3\lib\site-packages\statsmodels\tsa\base\tsa_model.py:132: FutureWarning: The 'freq' argument in Timestamp is deprecated and will be removed in a future version.
date_key = Timestamp(key, freq=base_index.freq)
c:\Users\Rudy\anaconda3\lib\site-packages\statsmodels\tsa\statespace\sarimax.py:978: UserWarning: Non-invertible starting MA parameters found. Using zeros as starting parameters.
warn('Non-invertible starting MA parameters found.'
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
df1.tail(365).plot(legend = True)
fcastHoltWinter2.plot(legend = True)
fcastSARIMA2.plot(legend = True)
prophetForecast2['Prophet'].plot(legend = True)
<AxesSubplot:xlabel='Date'>
#HoltWinters
beforeDiff = np.array([])
for i in range(0, len(fcastHoltWinter2)):
beforeDiff = np.append(beforeDiff, df['First.Time.Visits'][-357 + i] + fcastHoltWinter2[i])
beforeDiff = pd.DataFrame(beforeDiff, columns = ['HoltWinter'])
index = pd.date_range(start = "2020-08-20", end = "2020-09-18", freq = "D")
beforeDiff = beforeDiff.set_index(index)
#SARIMA
SARIMADiff = np.array([])
for i in range(0, len(fcastSARIMA2)):
SARIMADiff = np.append(SARIMADiff, df['First.Time.Visits'][-357 + i] + fcastSARIMA2[i])
beforeDiff['SARIMA'] = SARIMADiff
#Prophet
ProphetDiff = np.array([])
for i in range(0, len(prophetForecast2['Prophet'])):
ProphetDiff = np.append(ProphetDiff, df['First.Time.Visits'][-357 + i] + prophetForecast2['Prophet'][i])
beforeDiff['Prophet'] = ProphetDiff
beforeDiff
| HoltWinter | SARIMA | Prophet | |
|---|---|---|---|
| 2020-08-20 | 2009.249523 | 1834.301014 | 2639.760992 |
| 2020-08-21 | 988.999366 | 1003.389958 | 1473.221874 |
| 2020-08-22 | 281.272051 | 348.573573 | 834.209728 |
| 2020-08-23 | 1499.590154 | 1695.074428 | 2245.931276 |
| 2020-08-24 | 2318.389359 | 2524.379699 | 3173.282713 |
| 2020-08-25 | 1942.246271 | 2183.738279 | 2836.679709 |
| 2020-08-26 | 2346.799365 | 2302.187813 | 3143.817358 |
| 2020-08-27 | 2009.587455 | 2036.593985 | 2774.866301 |
| 2020-08-28 | 1264.337298 | 1225.284874 | 1883.735790 |
| 2020-08-29 | 352.609984 | 379.187282 | 1040.619998 |
| 2020-08-30 | 1819.928086 | 2024.736025 | 2700.761893 |
| 2020-08-31 | 3100.727291 | 3398.878093 | 4089.104452 |
| 2020-09-01 | 2173.584203 | 2508.680263 | 3200.118866 |
| 2020-09-02 | 2313.137297 | 2545.538951 | 3240.562477 |
| 2020-09-03 | 2214.925388 | 2413.060372 | 3108.672726 |
| 2020-09-04 | 1197.675231 | 1247.686258 | 1943.427957 |
| 2020-09-05 | 367.947916 | 483.588934 | 1180.091439 |
| 2020-09-06 | 1982.266019 | 2276.134375 | 2984.973025 |
| 2020-09-07 | 3503.065224 | 3890.275260 | 4611.078269 |
| 2020-09-08 | 2595.922135 | 3020.079884 | 3739.934726 |
| 2020-09-09 | 2541.475230 | 2862.938873 | 3584.348245 |
| 2020-09-10 | 2282.263320 | 2569.460577 | 3289.595562 |
| 2020-09-11 | 1391.013163 | 1530.087652 | 2248.684102 |
| 2020-09-12 | 563.285849 | 767.990595 | 1485.895369 |
| 2020-09-13 | 2135.603951 | 2518.532736 | 3247.546358 |
| 2020-09-14 | 3421.403156 | 3897.672438 | 4637.638816 |
| 2020-09-15 | 2517.260068 | 3030.479515 | 3768.686081 |
| 2020-09-16 | 2532.813162 | 2943.338806 | 3682.470262 |
| 2020-09-17 | 2369.601253 | 2745.860792 | 3483.236065 |
| 2020-09-18 | 1339.351095 | 1567.489057 | 2302.952102 |
df['First.Time.Visits'].tail(380).plot(legend = True)
beforeDiff['HoltWinter'].plot(legend = True)
beforeDiff['SARIMA'].plot(legend = True)
beforeDiff['Prophet'].plot(legend = True)
<AxesSubplot:xlabel='Date'>
data = df.iloc[:-30].reset_index()[['Date', 'First.Time.Visits']]
data.columns = ['ds', 'y']
fitProphet = Prophet(weekly_seasonality = True, yearly_seasonality = True)
fitProphet.fit(data)
future = fitProphet.make_future_dataframe(30)
fcastProphet = fitProphet.predict(future)
#going back to normal dataframe from prophet
prophetForecast3 = fcastProphet[['ds', 'yhat']].iloc[-30:]
prophetForecast3.set_index('ds', inplace = True)
prophetForecast3.index.name = 'Date'
prophetForecast3.columns = ['Prophet']
prophetForecast3.index.freq = 'D'
print(mean_absolute_percentage_error(df['First.Time.Visits'][-30:], prophetForecast3['Prophet']))
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
0.12622839895747606
data = df.reset_index()[['Date', 'First.Time.Visits']]
data.columns = ['ds', 'y']
fitProphet = Prophet(weekly_seasonality = True, yearly_seasonality = True)
fitProphet.fit(data)
future = fitProphet.make_future_dataframe(30)
fcastProphet = fitProphet.predict(future)
#going back to normal dataframe from prophet
prophetForecast3 = fcastProphet[['ds', 'yhat']].iloc[-30:]
prophetForecast3.set_index('ds', inplace = True)
prophetForecast3.index.name = 'Date'
prophetForecast3.columns = ['Prophet']
prophetForecast3.index.freq = 'D'
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
df['First.Time.Visits'].tail(380).plot(legend = True)
prophetForecast3['Prophet'].plot(legend = True)
<AxesSubplot:xlabel='Date'>
plt.rc("figure", figsize = (24, 16))
#best prophet
prophetForecast3['Prophet'].plot(label = "Best Prophet")
#weekly resampling
fcastSES.plot()
fcastHolt.plot(label = "Resampling Holt")
fcastHoltWinter.plot(label = "Resampling HoltWinter")
fcastSARIMA.plot(label = "Resampling SARIMA")
prophetForecast['First.Time.Visits'].plot(label = "Resampling Prophet")
#yearly differencing
beforeDiff['HoltWinter'].plot(label = "Differencing HoltWinter")
beforeDiff['SARIMA'].plot(label = "Differencing SARIMA")
beforeDiff['Prophet'].plot(label = "Differencing Prophet")
plt.legend()
<matplotlib.legend.Legend at 0x1811bef1220>
df1 = df['First.Time.Visits']
df1
Date
2014-09-14 1430
2014-09-15 2297
2014-09-16 2352
2014-09-17 2327
2014-09-18 2130
...
2020-08-15 1373
2020-08-16 1686
2020-08-17 2181
2020-08-18 2184
2020-08-19 1297
Freq: D, Name: First.Time.Visits, Length: 2167, dtype: int32
df1.plot()
resultSeasonal = seasonal_decompose(df1, freq = 7)
resultSeasonal.plot();
C:\Users\Rudy\AppData\Local\Temp/ipykernel_12076/3189490167.py:2: FutureWarning: the 'freq'' keyword is deprecated, use 'period' instead resultSeasonal = seasonal_decompose(df1, freq = 7)
auto_arima(df1, seasonal = True, m = 7, trace = True).summary()
Performing stepwise search to minimize aic ARIMA(2,1,2)(1,0,1)[7] intercept : AIC=29239.214, Time=3.51 sec ARIMA(0,1,0)(0,0,0)[7] intercept : AIC=33582.394, Time=0.03 sec ARIMA(1,1,0)(1,0,0)[7] intercept : AIC=29798.887, Time=0.63 sec ARIMA(0,1,1)(0,0,1)[7] intercept : AIC=31761.991, Time=0.72 sec ARIMA(0,1,0)(0,0,0)[7] : AIC=33580.394, Time=0.01 sec ARIMA(2,1,2)(0,0,1)[7] intercept : AIC=30650.623, Time=3.11 sec ARIMA(2,1,2)(1,0,0)[7] intercept : AIC=inf, Time=2.23 sec ARIMA(2,1,2)(2,0,1)[7] intercept : AIC=inf, Time=11.13 sec ARIMA(2,1,2)(1,0,2)[7] intercept : AIC=inf, Time=11.17 sec ARIMA(2,1,2)(0,0,0)[7] intercept : AIC=31477.422, Time=1.01 sec ARIMA(2,1,2)(0,0,2)[7] intercept : AIC=30574.240, Time=6.83 sec ARIMA(2,1,2)(2,0,0)[7] intercept : AIC=inf, Time=4.75 sec ARIMA(2,1,2)(2,0,2)[7] intercept : AIC=inf, Time=12.17 sec ARIMA(1,1,2)(1,0,1)[7] intercept : AIC=29184.157, Time=3.41 sec ARIMA(1,1,2)(0,0,1)[7] intercept : AIC=31224.725, Time=2.10 sec ARIMA(1,1,2)(1,0,0)[7] intercept : AIC=inf, Time=1.87 sec ARIMA(1,1,2)(2,0,1)[7] intercept : AIC=inf, Time=4.24 sec ARIMA(1,1,2)(1,0,2)[7] intercept : AIC=inf, Time=8.72 sec ARIMA(1,1,2)(0,0,0)[7] intercept : AIC=32326.314, Time=0.88 sec ARIMA(1,1,2)(0,0,2)[7] intercept : AIC=30609.381, Time=6.08 sec ARIMA(1,1,2)(2,0,0)[7] intercept : AIC=inf, Time=4.80 sec ARIMA(1,1,2)(2,0,2)[7] intercept : AIC=inf, Time=10.38 sec ARIMA(0,1,2)(1,0,1)[7] intercept : AIC=29206.901, Time=2.86 sec ARIMA(1,1,1)(1,0,1)[7] intercept : AIC=29228.169, Time=3.32 sec ARIMA(1,1,3)(1,0,1)[7] intercept : AIC=inf, Time=3.15 sec ARIMA(0,1,1)(1,0,1)[7] intercept : AIC=29287.818, Time=1.68 sec ARIMA(0,1,3)(1,0,1)[7] intercept : AIC=29184.633, Time=3.32 sec ARIMA(2,1,1)(1,0,1)[7] intercept : AIC=inf, Time=3.23 sec ARIMA(2,1,3)(1,0,1)[7] intercept : AIC=inf, Time=4.38 sec ARIMA(1,1,2)(1,0,1)[7] : AIC=29181.898, Time=1.74 sec ARIMA(1,1,2)(0,0,1)[7] : AIC=31222.727, Time=0.77 sec ARIMA(1,1,2)(1,0,0)[7] : AIC=inf, Time=1.11 sec ARIMA(1,1,2)(2,0,1)[7] : AIC=inf, Time=3.13 sec ARIMA(1,1,2)(1,0,2)[7] : AIC=inf, Time=6.36 sec ARIMA(1,1,2)(0,0,0)[7] : AIC=32324.315, Time=0.39 sec ARIMA(1,1,2)(0,0,2)[7] : AIC=30607.382, Time=1.77 sec ARIMA(1,1,2)(2,0,0)[7] : AIC=inf, Time=3.80 sec ARIMA(1,1,2)(2,0,2)[7] : AIC=inf, Time=7.02 sec ARIMA(0,1,2)(1,0,1)[7] : AIC=29205.132, Time=1.05 sec ARIMA(1,1,1)(1,0,1)[7] : AIC=29244.584, Time=1.83 sec ARIMA(2,1,2)(1,0,1)[7] : AIC=inf, Time=2.89 sec ARIMA(1,1,3)(1,0,1)[7] : AIC=inf, Time=1.73 sec ARIMA(0,1,1)(1,0,1)[7] : AIC=29287.562, Time=0.76 sec ARIMA(0,1,3)(1,0,1)[7] : AIC=29182.928, Time=1.41 sec ARIMA(2,1,1)(1,0,1)[7] : AIC=inf, Time=2.03 sec ARIMA(2,1,3)(1,0,1)[7] : AIC=inf, Time=2.48 sec Best model: ARIMA(1,1,2)(1,0,1)[7] Total fit time: 161.988 seconds
| Dep. Variable: | y | No. Observations: | 2167 |
|---|---|---|---|
| Model: | SARIMAX(1, 1, 2)x(1, 0, [1], 7) | Log Likelihood | -14584.949 |
| Date: | Fri, 13 May 2022 | AIC | 29181.898 |
| Time: | 14:46:31 | BIC | 29215.982 |
| Sample: | 0 | HQIC | 29194.362 |
| - 2167 | |||
| Covariance Type: | opg |
| coef | std err | z | P>|z| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| ar.L1 | 0.3412 | 0.060 | 5.684 | 0.000 | 0.224 | 0.459 |
| ma.L1 | -0.3571 | 0.062 | -5.789 | 0.000 | -0.478 | -0.236 |
| ma.L2 | -0.2031 | 0.020 | -10.095 | 0.000 | -0.243 | -0.164 |
| ar.S.L7 | 1.0000 | 3.67e-05 | 2.72e+04 | 0.000 | 1.000 | 1.000 |
| ma.S.L7 | -0.8162 | 0.012 | -68.249 | 0.000 | -0.840 | -0.793 |
| sigma2 | 4.062e+04 | 848.870 | 47.850 | 0.000 | 3.9e+04 | 4.23e+04 |
| Ljung-Box (L1) (Q): | 0.04 | Jarque-Bera (JB): | 647.34 |
|---|---|---|---|
| Prob(Q): | 0.85 | Prob(JB): | 0.00 |
| Heteroskedasticity (H): | 1.20 | Skew: | 0.06 |
| Prob(H) (two-sided): | 0.01 | Kurtosis: | 5.68 |
train_data = df1.iloc[:-30]
test_data = df1.iloc[-30:]
fitSES3 = SimpleExpSmoothing(train_data).fit()
fcastSES3 = fitSES3.forecast(len(test_data)).rename('SES predict')
print(mean_absolute_percentage_error(test_data, fcastSES3))
0.23695891753210677
c:\Users\Rudy\anaconda3\lib\site-packages\statsmodels\tsa\holtwinters\model.py:427: FutureWarning: After 0.13 initialization must be handled at model creation warnings.warn( c:\Users\Rudy\anaconda3\lib\site-packages\statsmodels\tsa\holtwinters\model.py:920: ConvergenceWarning: Optimization failed to converge. Check mle_retvals. warnings.warn( c:\Users\Rudy\anaconda3\lib\site-packages\statsmodels\tsa\base\tsa_model.py:132: FutureWarning: The 'freq' argument in Timestamp is deprecated and will be removed in a future version. date_key = Timestamp(key, freq=base_index.freq)
fitHolt1_3 = Holt(train_data, exponential = False).fit()
fcastHolt1_3 = fitHolt1_3.forecast(len(test_data)).rename("Holt's predict 1")
#Exponential
fitHolt2_3 = Holt(train_data, exponential = True).fit()
fcastHolt2_3 = fitHolt2_3.forecast(len(test_data)).rename("Holt's predict 2")
#Damped trend
fitHolt3_3 = Holt(train_data, damped_trend = True).fit()
fcastHolt3_3 = fitHolt3_3.forecast(len(test_data)).rename("Holt's predict 3")
#Exponential and damped
fitHolt4_3 = Holt(train_data, exponential = True, damped_trend = True).fit()
fcastHolt4_3 = fitHolt4_3.forecast(len(test_data)).rename("Holt's predict 4")
#finding out the best one based on MAPE
print(mean_absolute_percentage_error(test_data, fcastHolt1_3))
print(mean_absolute_percentage_error(test_data, fcastHolt2_3))
print(mean_absolute_percentage_error(test_data, fcastHolt3_3))
print(mean_absolute_percentage_error(test_data, fcastHolt4_3))
c:\Users\Rudy\anaconda3\lib\site-packages\statsmodels\tsa\holtwinters\model.py:427: FutureWarning: After 0.13 initialization must be handled at model creation warnings.warn( c:\Users\Rudy\anaconda3\lib\site-packages\statsmodels\tsa\holtwinters\model.py:920: ConvergenceWarning: Optimization failed to converge. Check mle_retvals. warnings.warn( c:\Users\Rudy\anaconda3\lib\site-packages\statsmodels\tsa\base\tsa_model.py:132: FutureWarning: The 'freq' argument in Timestamp is deprecated and will be removed in a future version. date_key = Timestamp(key, freq=base_index.freq) c:\Users\Rudy\anaconda3\lib\site-packages\statsmodels\tsa\holtwinters\model.py:80: RuntimeWarning: overflow encountered in matmul return err.T @ err c:\Users\Rudy\anaconda3\lib\site-packages\statsmodels\tsa\holtwinters\model.py:920: ConvergenceWarning: Optimization failed to converge. Check mle_retvals. warnings.warn( c:\Users\Rudy\anaconda3\lib\site-packages\statsmodels\tsa\holtwinters\model.py:920: ConvergenceWarning: Optimization failed to converge. Check mle_retvals. warnings.warn( c:\Users\Rudy\anaconda3\lib\site-packages\statsmodels\tsa\holtwinters\model.py:920: ConvergenceWarning: Optimization failed to converge. Check mle_retvals. warnings.warn(
0.20599586600524325 0.6899685099415768 0.2228381620874419 0.45746085133639824
#multiplicative for both
fitHoltWinter1_3 = ExponentialSmoothing(train_data, trend = 'mul', seasonal = 'mul', seasonal_periods = 7).fit()
fcastHoltWinter1_3 = fitHoltWinter1_3.forecast(len(test_data)).rename("Holt-Winter's predict 1")
#mul trend, add seasonal
fitHoltWinter2_3 = ExponentialSmoothing(train_data, trend = 'mul', seasonal = 'add', seasonal_periods = 7).fit()
fcastHoltWinter2_3 = fitHoltWinter2_3.forecast(len(test_data)).rename("Holt-Winter's predict 2")
#add trend, mul seasonal
fitHoltWinter3_3 = ExponentialSmoothing(train_data, trend = 'add', seasonal = 'mul', seasonal_periods = 7).fit()
fcastHoltWinter3_3 = fitHoltWinter3_3.forecast(len(test_data)).rename("Holt-Winter's predict 3")
#both additive
fitHoltWinter4_3 = ExponentialSmoothing(train_data, trend = 'add', seasonal = 'add', seasonal_periods = 7).fit()
fcastHoltWinter4_3 = fitHoltWinter4_3.forecast(len(test_data)).rename("Holt-Winter's predict 4")
print(mean_absolute_percentage_error(test_data, fcastHoltWinter1_3))
print(mean_absolute_percentage_error(test_data, fcastHoltWinter2_3))
print(mean_absolute_percentage_error(test_data, fcastHoltWinter3_3))
print(mean_absolute_percentage_error(test_data, fcastHoltWinter4_3))
c:\Users\Rudy\anaconda3\lib\site-packages\statsmodels\tsa\holtwinters\model.py:427: FutureWarning: After 0.13 initialization must be handled at model creation warnings.warn( c:\Users\Rudy\anaconda3\lib\site-packages\statsmodels\tsa\holtwinters\model.py:80: RuntimeWarning: overflow encountered in matmul return err.T @ err c:\Users\Rudy\anaconda3\lib\site-packages\statsmodels\tsa\base\tsa_model.py:132: FutureWarning: The 'freq' argument in Timestamp is deprecated and will be removed in a future version. date_key = Timestamp(key, freq=base_index.freq) c:\Users\Rudy\anaconda3\lib\site-packages\statsmodels\tsa\holtwinters\model.py:920: ConvergenceWarning: Optimization failed to converge. Check mle_retvals. warnings.warn( c:\Users\Rudy\anaconda3\lib\site-packages\statsmodels\tsa\holtwinters\model.py:920: ConvergenceWarning: Optimization failed to converge. Check mle_retvals. warnings.warn(
0.07963086827853723 0.14322417663506207 0.057614432573351505 0.21759981970645764
c:\Users\Rudy\anaconda3\lib\site-packages\statsmodels\tsa\holtwinters\model.py:920: ConvergenceWarning: Optimization failed to converge. Check mle_retvals. warnings.warn(
fitSARIMA3 = SARIMAX(train_data, order = (1, 1, 2), seasonal_order = (1, 0, 1, 7)).fit()
fitSARIMA3.summary()
c:\Users\Rudy\anaconda3\lib\site-packages\statsmodels\tsa\statespace\sarimax.py:978: UserWarning: Non-invertible starting MA parameters found. Using zeros as starting parameters.
warn('Non-invertible starting MA parameters found.'
| Dep. Variable: | First.Time.Visits | No. Observations: | 2137 |
|---|---|---|---|
| Model: | SARIMAX(1, 1, 2)x(1, 0, [1], 7) | Log Likelihood | -14382.328 |
| Date: | Fri, 13 May 2022 | AIC | 28776.657 |
| Time: | 14:46:34 | BIC | 28810.657 |
| Sample: | 09-14-2014 | HQIC | 28789.099 |
| - 07-20-2020 | |||
| Covariance Type: | opg |
| coef | std err | z | P>|z| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| ar.L1 | 0.3416 | 0.060 | 5.684 | 0.000 | 0.224 | 0.459 |
| ma.L1 | -0.3561 | 0.062 | -5.765 | 0.000 | -0.477 | -0.235 |
| ma.L2 | -0.2039 | 0.020 | -10.094 | 0.000 | -0.243 | -0.164 |
| ar.S.L7 | 1.0000 | 3.58e-05 | 2.8e+04 | 0.000 | 1.000 | 1.000 |
| ma.S.L7 | -0.8140 | 0.012 | -67.708 | 0.000 | -0.838 | -0.790 |
| sigma2 | 4.058e+04 | 873.729 | 46.441 | 0.000 | 3.89e+04 | 4.23e+04 |
| Ljung-Box (L1) (Q): | 0.04 | Jarque-Bera (JB): | 564.51 |
|---|---|---|---|
| Prob(Q): | 0.84 | Prob(JB): | 0.00 |
| Heteroskedasticity (H): | 1.18 | Skew: | 0.11 |
| Prob(H) (two-sided): | 0.03 | Kurtosis: | 5.51 |
start = len(train_data)
end = start + len(test_data) - 1
fcastSARIMA3 = fitSARIMA3.predict(start = start, end = end, dynamic = False).rename('SARIMA')
print(mean_absolute_percentage_error(test_data, fcastSARIMA3))
0.08573595227246479
data = train_data.reset_index()[['Date', 'First.Time.Visits']]
data.columns = ['ds', 'y']
data
| ds | y | |
|---|---|---|
| 0 | 2014-09-14 | 1430 |
| 1 | 2014-09-15 | 2297 |
| 2 | 2014-09-16 | 2352 |
| 3 | 2014-09-17 | 2327 |
| 4 | 2014-09-18 | 2130 |
| ... | ... | ... |
| 2132 | 2020-07-16 | 2463 |
| 2133 | 2020-07-17 | 2122 |
| 2134 | 2020-07-18 | 1606 |
| 2135 | 2020-07-19 | 1892 |
| 2136 | 2020-07-20 | 2428 |
2137 rows × 2 columns
fitProphet = Prophet(weekly_seasonality = True)
fitProphet.fit(data)
future = fitProphet.make_future_dataframe(len(test_data))
fcastProphet = fitProphet.predict(future)
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
prophetForecast3 = fcastProphet[['ds', 'yhat']].iloc[-len(test_data):]
prophetForecast3.set_index('ds', inplace = True)
prophetForecast3.index.name = 'Date'
prophetForecast3.columns = ['Prophet']
prophetForecast3.index.freq = 'D'
prophetForecast3
| Prophet | |
|---|---|
| Date | |
| 2020-07-21 | 2718.915889 |
| 2020-07-22 | 2686.036950 |
| 2020-07-23 | 2530.421491 |
| 2020-07-24 | 1940.790216 |
| 2020-07-25 | 1248.172355 |
| 2020-07-26 | 1719.798283 |
| 2020-07-27 | 2627.903236 |
| 2020-07-28 | 2695.068820 |
| 2020-07-29 | 2659.032941 |
| 2020-07-30 | 2501.708639 |
| 2020-07-31 | 1911.812035 |
| 2020-08-01 | 1220.316234 |
| 2020-08-02 | 1694.347974 |
| 2020-08-03 | 2605.996923 |
| 2020-08-04 | 2677.664100 |
| 2020-08-05 | 2646.879807 |
| 2020-08-06 | 2495.331249 |
| 2020-08-07 | 1911.499766 |
| 2020-08-08 | 1226.124179 |
| 2020-08-09 | 1706.106721 |
| 2020-08-10 | 2623.331130 |
| 2020-08-11 | 2700.019086 |
| 2020-08-12 | 2673.555100 |
| 2020-08-13 | 2525.519795 |
| 2020-08-14 | 1944.331312 |
| 2020-08-15 | 1260.711192 |
| 2020-08-16 | 1741.591135 |
| 2020-08-17 | 2658.930160 |
| 2020-08-18 | 2735.068400 |
| 2020-08-19 | 2707.547705 |
mean_absolute_percentage_error(test_data, prophetForecast3['Prophet'])
0.12622839895747606
test_data.plot(legend = True)
fcastHoltWinter3_3.plot(legend = True)
fcastSARIMA3.plot(legend = True)
prophetForecast3['Prophet'].plot(legend = True)
<AxesSubplot:xlabel='Date'>
#Holt-Winter's method
fitHoltWinter3_3 = ExponentialSmoothing(df1, trend = 'add', seasonal = 'mul', seasonal_periods = 7).fit()
fcastHoltWinter3_3 = fitHoltWinter3_3.forecast(30).rename("Holt-Winter's predict")
#SARIMA
fitSARIMA3 = SARIMAX(df1, order = (1, 1, 2), seasonal_order = (1, 0, 1, 7)).fit()
start = len(df1)
end = start + 29
fcastSARIMA3 = fitSARIMA3.predict(start = start, end = end, dynamic = False).rename('SARIMA')
#Prophet
data = df1.reset_index()[['Date', 'First.Time.Visits']]
data.columns = ['ds', 'y']
fitProphet = Prophet(weekly_seasonality = True)
fitProphet.fit(data)
future = fitProphet.make_future_dataframe(30)
fcastProphet = fitProphet.predict(future)
#going back to normal dataframe from prophet
prophetForecast3 = fcastProphet[['ds', 'yhat']].iloc[-30:]
prophetForecast3.set_index('ds', inplace = True)
prophetForecast3.index.name = 'Date'
prophetForecast3.columns = ['Prophet']
prophetForecast3.index.freq = 'D'
c:\Users\Rudy\anaconda3\lib\site-packages\statsmodels\tsa\holtwinters\model.py:427: FutureWarning: After 0.13 initialization must be handled at model creation
warnings.warn(
c:\Users\Rudy\anaconda3\lib\site-packages\statsmodels\tsa\holtwinters\model.py:920: ConvergenceWarning: Optimization failed to converge. Check mle_retvals.
warnings.warn(
c:\Users\Rudy\anaconda3\lib\site-packages\statsmodels\tsa\base\tsa_model.py:132: FutureWarning: The 'freq' argument in Timestamp is deprecated and will be removed in a future version.
date_key = Timestamp(key, freq=base_index.freq)
c:\Users\Rudy\anaconda3\lib\site-packages\statsmodels\tsa\statespace\sarimax.py:978: UserWarning: Non-invertible starting MA parameters found. Using zeros as starting parameters.
warn('Non-invertible starting MA parameters found.'
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
df1.tail(365).plot(legend = True)
fcastHoltWinter3_3.plot(legend = True)
fcastSARIMA3.plot(legend = True)
prophetForecast3['Prophet'].plot(legend = True)
<AxesSubplot:xlabel='Date'>